{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bertopic import BERTopic\n",
    "import pandas as pd\n",
    "import os\n",
    "import json\n",
    "import httpcore\n",
    "setattr(httpcore, 'SyncHTTPTransport', 'AsyncHTTPProxy')\n",
    "from googletrans import Translator, LANGUAGES\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "tqdm.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 44/44 [00:00<00:00, 67.18it/s]\n",
      "100%|██████████| 18357/18357 [00:03<00:00, 5405.85it/s]\n"
     ]
    }
   ],
   "source": [
    "# Shared googletrans client; translate_to_english() below reads this global.\n",
    "translator = Translator()\n",
    "\n",
    "# Directory holding the downloaded feed JSON files.\n",
    "path = \"./feed_data/full_feed_download\"\n",
    "# os.listdir() returns entries in arbitrary order. Sort them so the later\n",
    "# de-duplication on feed_uri keeps the same row on every run.\n",
    "files = sorted(os.listdir(path))\n",
    "\n",
    "def get_feed_data(x):\n",
    "    \"\"\"Flatten one raw feed entry into a flat dict of feed columns.\n",
    "\n",
    "    `x` must provide 'uri', 'cid' and a 'value' mapping. The 'createdAt',\n",
    "    'description' and 'displayName' entries of 'value' are optional and\n",
    "    default to an empty string when absent.\n",
    "    \"\"\"\n",
    "    row = {'feed_uri': x['uri'], 'creator_cid': x['cid']}\n",
    "    record = x['value']\n",
    "    optional_fields = (\n",
    "        ('feed_createdAt', 'createdAt'),\n",
    "        ('feed_description', 'description'),\n",
    "        ('feed_displayname', 'displayName'),\n",
    "    )\n",
    "    for column, key in optional_fields:\n",
    "        row[column] = record.get(key, \"\")\n",
    "    return row\n",
    "\n",
    "# Flatten every {creator_did: [feed, ...]} file into one row per feed.\n",
    "all_feeds = []\n",
    "for file in tqdm(files):\n",
    "    # Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.\n",
    "    with open(os.path.join(path, file), encoding=\"utf-8\") as f:\n",
    "        data = json.load(f)\n",
    "    for user, user_feeds in data.items():\n",
    "        for feed in user_feeds:\n",
    "            feed_data = get_feed_data(feed)\n",
    "            feed_data['creator_did'] = user\n",
    "            all_feeds.append(feed_data)\n",
    "\n",
    "scraped_feeds = pd.DataFrame(all_feeds)\n",
    "# Keep only the first row seen for each feed_uri.\n",
    "scraped_feeds = scraped_feeds.drop_duplicates(subset=\"feed_uri\").reset_index(drop = True)\n",
    "\n",
    "\n",
    "# Load the Feed Likers\n",
    "def get_likers(liker_dict):\n",
    "    \"\"\"Expand a single-feed {feed_uri: [liker, ...]} dict into one row per liker.\n",
    "\n",
    "    Only the first key of `liker_dict` is read. Each liker is indexed\n",
    "    positionally: 0 -> did, 1 -> display name, 2 -> description, 3 -> createdAt.\n",
    "    \"\"\"\n",
    "    feed_uri = list(liker_dict)[0]\n",
    "    # NOTE: the 'liker_dsplayname' spelling is kept as-is because it becomes the\n",
    "    # column name written to feed_likers.csv.\n",
    "    return [\n",
    "        {\n",
    "            'feed_uri': feed_uri,\n",
    "            'liker_did': liker[0],\n",
    "            'liker_dsplayname': liker[1],\n",
    "            'liker_description': liker[2],\n",
    "            'liker_createdAt': liker[3],\n",
    "        }\n",
    "        for liker in liker_dict[feed_uri]\n",
    "    ]\n",
    "\n",
    "path = \"./feed_data/feed_likes\"\n",
    "# Sorted so the row order of feed_likers.csv does not depend on directory listing order.\n",
    "files = sorted(os.listdir(path))\n",
    "\n",
    "scraped_likers = []\n",
    "\n",
    "for file in tqdm(files):\n",
    "    # Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.\n",
    "    with open(os.path.join(path, file), encoding=\"utf-8\") as f:\n",
    "        data = json.load(f)\n",
    "    # Each file maps feed_uri -> list of likers; get_likers() takes one feed at a time.\n",
    "    for feed in data:\n",
    "        scraped_likers.extend(get_likers({feed:data[feed]}))\n",
    "\n",
    "scraped_likers = pd.DataFrame(scraped_likers)\n",
    "scraped_likers.to_csv(\"./feed_likers.csv\", index = False)\n",
    "\n",
    "\n",
    "# Function to translate text to English\n",
    "def translate_to_english(text):\n",
    "    \"\"\"Best-effort translation of `text` to English.\n",
    "\n",
    "    Returns the translated string. `text` is returned unchanged when it is\n",
    "    not a non-blank string or when the translation call fails for any reason.\n",
    "    \"\"\"\n",
    "    # Nothing to translate: skip the translator call for missing/blank values.\n",
    "    if not isinstance(text, str) or not text.strip():\n",
    "        return text\n",
    "    try:\n",
    "        # Translate the text to English\n",
    "        translation = translator.translate(text, dest='en')\n",
    "        return translation.text\n",
    "    except Exception:\n",
    "        # Deliberate best effort: return original text if translation fails\n",
    "        return text\n",
    "\n",
    "tqdm.pandas()\n",
    "description_dict = {}\n",
    "# A freshly built scraped_feeds has no translation column yet; create it so the\n",
    "# isna() mask below does not raise KeyError.\n",
    "if \"description_en\" not in scraped_feeds.columns:\n",
    "    scraped_feeds[\"description_en\"] = None\n",
    "# .copy() so the column assignment below writes to an independent frame rather than\n",
    "# a slice of scraped_feeds (avoids SettingWithCopyWarning).\n",
    "remaining_descriptions = scraped_feeds[scraped_feeds[\"description_en\"].isna()].copy()\n",
    "remaining_descriptions[\"description_en\"] = remaining_descriptions[\"feed_description\"].progress_apply(translate_to_english)\n",
    "# Record original description -> translation for the rows just processed.\n",
    "description_dict.update(remaining_descriptions.set_index(\"feed_description\")[\"description_en\"].to_dict())\n",
    "# Write the new translations back into the full frame by index label.\n",
    "scraped_feeds.loc[remaining_descriptions.index, \"description_en\"] = remaining_descriptions[\"description_en\"]\n",
    "scraped_feeds.to_csv(\"./feeds_en.csv\", index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 1121/1121 [02:35<00:00,  7.21it/s]\n",
      "2024-10-30 16:54:53,461 - BERTopic - Transformed documents to Embeddings\n",
      "2024-10-30 16:55:27,023 - BERTopic - Reduced dimensionality\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-10-30 16:55:30,175 - BERTopic - Clustered reduced embeddings\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of topics: 463\n",
      "Percentage unclustered: 41.67%\n",
      "Number of descriptions: 35871\n",
      "\\begin{table}[!ht]\n",
      "\\begin{adjustwidth}{-1in}{0in}\n",
      "\\centering\n",
      "\\begin{tabular}{@{\\extracolsep{5pt}} cccccc}\n",
      "\\toprule\n",
      "\\multicolumn{3}{c}{Top Feeds by Number of Feeds} & \\multicolumn{3}{c}{Top Feeds by Number of Likes} \\\\\n",
      "\\cmidrule(r){1-3}\\cmidrule(l){4-6}\n",
      "Topic & Representation & Count & Topic & Representation & Likes \\\\\n",
      "\\cmidrule(r){1-1}\\cmidrule(lr){2-2}\\cmidrule(lr){3-3}\\cmidrule(l){4-4}\\cmidrule(lr){5-5}\\cmidrule(l){6-6}\n",
      "0 & art, my, artwork, artists, all & 612 & 17 & furry, fursuit, furrylist, furries & 9,690 \\\\\n",
      "1 & music, songs, audio, song, jazz & 394 & 346 & furry, bskyprobablyawebsite, across & 6,222 \\\\\n",
      "2 & games, game, gaming, board & 390 & 331 & follower, follow, followers, back & 5,308 \\\\\n",
      "3 & oshikapu, shobamyu, kawaii, jay & 389 & 2 & games, game, gaming, board & 4,260 \\\\\n",
      "4 & nsfw, sfw, content, warning, art & 354 & 21 & education, science, teaching & 3,881 \\\\\n",
      "5 & tracking, malifaux, pom, falcom & 338 & 29 & books, book, reading, read, readers & 3,514 \\\\\n",
      "6 & tracking, matsuura, nakayama & 338 & 28 & likes, sorted, count, liked, hours & 3,487 \\\\\n",
      "7 & words, containing, name, search & 337 & 52 & bluesky, posted, compilation, bsky & 3,449 \\\\\n",
      "8 & japanese, fgo, language, japan & 333 & 55 & lgbtq, queer, lgbtqia, gay, vibrant & 3,327 \\\\\n",
      "9 & manga, anime, illustrations, drawn & 330 & 58 & blue, sky, sorthn, fishing & 3,262 \\\\\n",
      "\\bottomrule\n",
      "\\end{tabular}\n",
      "\\caption{Top Topics by Number of Feeds and Number of Likes}\n",
      "\\label{tab:topics}\n",
      "\\end{adjustwidth}\n",
      "\\end{table}\n"
     ]
    }
   ],
   "source": [
    "likers = pd.read_csv(\"./feed_likers.csv\")\n",
    "# Number of liker rows recorded for each feed_uri.\n",
    "likers_per_feed = likers.groupby(\"feed_uri\").size().to_dict()\n",
    "df = pd.read_csv(\"./feeds_en.csv\")\n",
    "\n",
    "# Remove links\n",
    "df[\"description_en\"] = df[\"description_en\"].str.replace(r'http\\S+', '', regex=True)\n",
    "# Remove html tags and replace newlines with spaces\n",
    "df[\"description_en\"] = df[\"description_en\"].str.replace(r'<.*?>', '', regex=True).str.replace(r'\\n', ' ', regex=True)\n",
    "# Remove tokens which are entirely numbers\n",
    "df[\"description_en\"] = df[\"description_en\"].str.replace(r'\\b\\d+\\b', '', regex=True)\n",
    "# Remove every character that is not a-zA-Z0-9 or whitespace\n",
    "df[\"description_en\"] = df[\"description_en\"].str.replace(r'[^a-zA-Z0-9\\s]', '', regex=True)\n",
    "# Trim surrounding whitespace\n",
    "df[\"description_en\"] = df[\"description_en\"].str.strip()\n",
    "# Remove \"blueskyfeedscom\" (currently disabled)\n",
    "#df[\"description_en\"] = df[\"description_en\"].str.replace(r'blueskyfeedscom', '', regex=False)\n",
    "# Keep only descriptions longer than one character (this also drops missing values).\n",
    "# .copy() so the column assignments below act on an independent frame instead of a\n",
    "# filtered slice (avoids SettingWithCopyWarning).\n",
    "df = df[df[\"description_en\"].str.len() > 1].copy()\n",
    "text = df[\"description_en\"]\n",
    "# NOTE(review): no seed is passed to the model here, so topic assignments may differ\n",
    "# between runs -- confirm whether reproducible topic ids are required.\n",
    "topic_model = BERTopic(verbose=True)\n",
    "topics, probs = topic_model.fit_transform(text)\n",
    "\n",
    "# Topic -1 is counted as a topic in the first figure and reported as unclustered in the second.\n",
    "print(f\"Number of topics: {len(set(topics))}\")\n",
    "print(f\"Percentage unclustered: {100 * np.mean([x == -1 for x in topics]):.2f}%\")\n",
    "print(f\"Number of descriptions: {len(text)}\")\n",
    "\n",
    "df[\"Topic\"] = topics\n",
    "# Feeds without any recorded liker get 0.\n",
    "df[\"number_of_likers\"] = df[\"feed_uri\"].map(likers_per_feed).fillna(0).astype(int)\n",
    "\n",
    "# Drop the unclustered bucket (-1) *before* taking the top 10, so exactly ten real\n",
    "# topics are returned whether or not -1 would have ranked among the largest.\n",
    "topic_sizes = df.groupby(\"Topic\").size()\n",
    "top_topics = topic_sizes[topic_sizes.index != -1].sort_values(ascending=False).head(10)\n",
    "\n",
    "topic_likers = df.groupby(\"Topic\").number_of_likers.sum()\n",
    "top_topics_num_likers = topic_likers[topic_likers.index != -1].sort_values(ascending=False).head(10)\n",
    "\n",
    "topic_info = topic_model.get_topic_info()\n",
    "\n",
    "# Ensure top_topics_num_likers has a name before merging\n",
    "top_topics_num_likers.name = 'number_of_likers'\n",
    "top_topics_num_likers = topic_info.merge(top_topics_num_likers, left_on='Topic', right_index=True, how='inner')\n",
    "\n",
    "# Ensure top_topics has a name before merging\n",
    "top_topics.name = 'topic_count'\n",
    "top_topics = topic_info.merge(top_topics, left_on='Topic', right_index=True, how='inner')\n",
    "\n",
    "\n",
    "\n",
    "def truncate_to_length(text, max_length):\n",
    "    \"\"\"Greedily keep the ', '-separated words of `text` that fit in `max_length` characters.\n",
    "\n",
    "    Words are considered in their original order. A word is appended (joined\n",
    "    with ', ') whenever that keeps the result within `max_length`. A word that\n",
    "    does not fit is skipped, and later, shorter words are still tried. The\n",
    "    result is a greedy fit, not necessarily the longest possible one.\n",
    "\n",
    "    Args:\n",
    "        text: Words separated by ', '.\n",
    "        max_length: Maximum length of the returned string, in characters.\n",
    "\n",
    "    Returns:\n",
    "        The selected words joined by ', ' (possibly the empty string).\n",
    "    \"\"\"\n",
    "    current_text = \"\"\n",
    "    for word in text.split(', '):\n",
    "        candidate = current_text + (\", \" if current_text else \"\") + word\n",
    "        # The text only ever grows, so a word that is too long now can never fit\n",
    "        # later: a single pass over the words is enough.\n",
    "        if len(candidate) <= max_length:\n",
    "            current_text = candidate\n",
    "    return current_text\n",
    "# Fixed LaTeX lines emitted before and after the table rows.\n",
    "table_head = [\n",
    "    r\"\\begin{table}[!ht]\",\n",
    "    r\"\\begin{adjustwidth}{-1in}{0in}\",\n",
    "    r\"\\centering\",\n",
    "    r\"\\begin{tabular}{@{\\extracolsep{5pt}} cccccc}\",\n",
    "    r\"\\toprule\",\n",
    "    r\"\\multicolumn{3}{c}{Top Feeds by Number of Feeds} & \\multicolumn{3}{c}{Top Feeds by Number of Likes} \\\\\",\n",
    "    r\"\\cmidrule(r){1-3}\\cmidrule(l){4-6}\",\n",
    "    r\"Topic & Representation & Count & Topic & Representation & Likes \\\\\",\n",
    "    r\"\\cmidrule(r){1-1}\\cmidrule(lr){2-2}\\cmidrule(lr){3-3}\\cmidrule(l){4-4}\\cmidrule(lr){5-5}\\cmidrule(l){6-6}\",\n",
    "]\n",
    "table_foot = [\n",
    "    r\"\\bottomrule\",\n",
    "    r\"\\end{tabular}\",\n",
    "    r\"\\caption{Top Topics by Number of Feeds and Number of Likes}\",\n",
    "    r\"\\label{tab:topics}\",\n",
    "    r\"\\end{adjustwidth}\",\n",
    "    r\"\\end{table}\",\n",
    "]\n",
    "\n",
    "for head_line in table_head:\n",
    "    print(head_line)\n",
    "\n",
    "# Order the likes side of the table by total likers, largest first.\n",
    "top_topics_num_likers = top_topics_num_likers.sort_values('number_of_likers', ascending=False)\n",
    "\n",
    "# Pair the i-th topic by feed count with the i-th topic by likers; zip stops at the shorter side.\n",
    "paired_rows = zip(top_topics.iterrows(), top_topics_num_likers.iterrows())\n",
    "for (_, by_count), (_, by_likes) in paired_rows:\n",
    "    cells = []\n",
    "    for row, value_column in ((by_count, 'topic_count'), (by_likes, 'number_of_likers')):\n",
    "        # First five representation words, underscores shown as spaces, cut to 35 characters.\n",
    "        words = \", \".join(row['Representation'][:5]).replace('_', ' ')\n",
    "        cells.append(str(row['Topic']))\n",
    "        cells.append(truncate_to_length(words, 35))\n",
    "        cells.append(\"{:,}\".format(int(row[value_column])))\n",
    "    print(\" & \".join(cells) + \" \\\\\\\\\")\n",
    "\n",
    "for foot_line in table_foot:\n",
    "    print(foot_line)\n",
    "\n",
    "\n",
    "df.to_csv(\"./feeds_with_topics.csv\", index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of topics: 463\n",
      "Number of unclustered: 14946\n",
      "Percentage unclustered: 41.67%\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# Reload the topic-annotated feeds from disk and recompute the clustering summary.\n",
    "df = pd.read_csv(\n",
    "    \"./feeds_with_topics.csv\",\n",
    "    quotechar='\"',          # Specify quote character\n",
    "    escapechar='\\\\',        # Specify escape character\n",
    "    skip_blank_lines=True,  # Skip empty lines\n",
    "    lineterminator='\\n',    # Specify line terminator\n",
    ")\n",
    "# Topic -1 marks rows that were not assigned to a cluster; it is also counted by nunique().\n",
    "is_unclustered = df[\"Topic\"] == -1\n",
    "number_of_topics = df[\"Topic\"].nunique()\n",
    "number_of_unclustered = int(is_unclustered.sum())\n",
    "percentage_unclustered = number_of_unclustered / len(df)\n",
    "print(f\"Number of topics: {number_of_topics}\")\n",
    "print(f\"Number of unclustered: {number_of_unclustered}\")\n",
    "print(f\"Percentage unclustered: {100*percentage_unclustered:.2f}%\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
